# In [4]:
import os
import torch
import kagglehub
import numpy as np
import polars as pl
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from dota import Dota2
from heroes import get_heroes
from model import Dota2Autoencoder
from dataset import get_dataset
from leagues import get_tier_one
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import DBSCAN
from itertools import product
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import OPTICS
# Download the Kaggle dataset and load hero metadata plus tier-one league info.
path = kagglehub.dataset_download("bwandowando/dota-2-pro-league-matches-2023")
heroes, hero_cols, dict_attributes, dict_roles = get_heroes(path)
tier_one_matches = get_tier_one(path)
n_heroes = len(heroes.collect())

# One dataset per The International edition, keyed by the patch(es) that
# edition was played on.  NOTE: this deliberately rebinds `hero_cols` from
# the get_heroes() result above to a per-edition list of column names,
# matching the original script's behavior.
_TI_PATCHES = [
    (2024, [56]),
    (2023, [53]),
    (2022, [51]),
    (2021, [49, 48]),  # TI 2021 spanned two patches
]
player_cols = []
hero_cols = []
_ti_datasets = {}
for _year, _patches in _TI_PATCHES:
    _ds, p_cols, h_cols = get_dataset(path, specific_patches=_patches)
    _ti_datasets[_year] = _ds
    player_cols.append(p_cols)
    hero_cols.append(h_cols)
# Preserve the per-edition names used throughout the rest of the script.
ti_2024 = _ti_datasets[2024]
ti_2023 = _ti_datasets[2023]
ti_2022 = _ti_datasets[2022]
ti_2021 = _ti_datasets[2021]
def _only_ti(patch_dataset, edition_name):
    # Attach league metadata to each match, then keep only the rows that
    # belong to the requested The International edition.
    joined = patch_dataset.join(tier_one_matches, on="league_id", how="left")
    return joined.filter(pl.col("league_name") == edition_name)


matches_ti_2024 = _only_ti(ti_2024, "The International 2024")
matches_ti_2023 = _only_ti(ti_2023, "The International 2023")
matches_ti_2022 = _only_ti(ti_2022, "The International 2022")
matches_ti_2021 = _only_ti(ti_2021, "The International 2021")

# Parallel lists, newest edition first: filtered TI matches and the full
# per-patch datasets they were drawn from.
internationals = [matches_ti_2024, matches_ti_2023,
                  matches_ti_2022, matches_ti_2021]
datasets = [ti_2024, ti_2023, ti_2022, ti_2021]
# Clustering configuration (shared by K-Means / Agglomerative / GMM below).
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# NOTE: the original assigned `data_np = matches_ti_2024.to_numpy()` here;
# that value was dead (unconditionally overwritten per batch in the
# evaluation loop before any read), so it has been removed.

# Autoencoder hyperparameters, shared by every TI edition's model.
hero_pick_embedding_dim: int = 16
hero_role_embedding_dim: int = 8
n_players: int = 5   # picks per team
n_bans: int = 7      # bans per team
latent_dim: int = 2  # 2-D latent so the space can be scatter-plotted directly
hidden_layers: list[int] = [256, 128, 64, 32]
dropout: float = 0.3
learning_rate: float = 0.001
# Train (or load) one autoencoder per TI edition, newest first.
autoencoders: list[Dota2Autoencoder] = []
for ti, dataset in enumerate(datasets):
    year = 2024 - ti  # `datasets` is ordered 2024, 2023, 2022, 2021
    autoencoder = Dota2Autoencoder(
        dict_roles=dict_roles,
        hero_cols=hero_cols[ti],
        player_cols=player_cols[ti],
        n_heroes=n_heroes,
        hero_pick_embedding_dim=hero_pick_embedding_dim,
        hero_role_embedding_dim=hero_role_embedding_dim,
        n_players=n_players,
        n_bans=n_bans,
        latent_dim=latent_dim,
        hidden_layers=hidden_layers,
        dropout=dropout,
        learning_rate=learning_rate,
        name=f"ti_{year}_autoencoder",
    )
    # BUGFIX: the original tested for "ti_*_best_model.h5" but then loaded
    # "ti_*_autoencoder.h5" — it could crash when only one of the two files
    # existed.  Check for the file we actually load.
    model_path = f"ti_{year}_autoencoder.h5"
    if os.path.exists(model_path):
        print(f"Loading pre-trained model for TI {year}")
        autoencoder.load_model(model_path)
    else:
        # BUGFIX: the original drew train/val/test with three independent
        # sample() calls that all used seed=42, so the splits overlapped and
        # val/test were identical subsets (data leakage).  Shuffle once and
        # slice disjoint 70/15/15 partitions instead.
        shuffled = dataset.sample(fraction=1.0, shuffle=True, seed=42)
        n_rows = shuffled.height
        n_train = int(n_rows * 0.7)
        n_val = int(n_rows * 0.15)
        train_df = shuffled.slice(0, n_train)
        val_df = shuffled.slice(n_train, n_val)
        test_df = shuffled.slice(n_train + n_val, None)
        print(f"Training autoencoder for TI {year}")
        print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}, Test shape: {test_df.shape}")
        print(f"Hero columns: {hero_cols[ti]}")
        print(f"Player columns: {player_cols[ti]}")
        autoencoder.train_data(train_df, val_df, epochs=100, patience=20,
                               best_model_filename=f"ti_{year}_best_model.h5", silent=True)
        autoencoder.save_loss_history(
            f"ti_{year}_loss_history.csv", silent=True)
        autoencoder.save_model(f"ti_{year}_autoencoder.h5", silent=True)
        accuracy, mse, _, _ = autoencoder.test_model(test_df)
        print(
            f"TI {year} - Accuracy: {accuracy}, MSE: {mse}, Loss: {autoencoder.best_val_loss}")
        print("=" * 50)
    autoencoders.append(autoencoder)
def _plot_latent_clusters(latent_space, labels, title, noise_label=None):
    """Scatter-plot 2-D latents colored by cluster label.

    When `noise_label` is given, points labeled -1 (density-based noise)
    are drawn in black with lighter alpha under that legend entry.
    """
    plt.figure(figsize=(8, 6))
    for cluster_id in np.unique(labels):
        mask = labels == cluster_id
        if noise_label is not None and cluster_id == -1:
            plt.scatter(latent_space[mask, 0], latent_space[mask, 1],
                        label=noise_label, alpha=0.5, c="k")
        else:
            plt.scatter(latent_space[mask, 0], latent_space[mask, 1],
                        label=f"Cluster {cluster_id}", alpha=0.7)
    plt.xlabel("Latent X")
    plt.ylabel("Latent Y")
    plt.title(title)
    plt.legend()
    plt.show()


# Cross-evaluation: encode every TI edition's matches with every trained
# autoencoder (full cartesian product), then cluster each latent space with
# several algorithms for comparison.
for ti_matches, autoencoder in product(internationals, autoencoders):
    ti = ti_matches.select('league_name').unique().item()
    print(f"Processing {ti_matches.shape[0]} matches from {ti}")
    print(f"Autoencoder name: {autoencoder.name}")
    autoencoder.eval()  # the original called eval() twice; once suffices
    encoded = []
    total_similarity = 0
    matches_encoded = []  # match ids per batch, kept for traceability (currently unused)
    with torch.no_grad():
        for batch in ti_matches.iter_slices(32):
            data_np = batch.to_numpy()
            batch_size = min(32, batch.shape[0])  # last slice may be short
            try:
                matches_encoded.append(batch.select("match_id").to_numpy())
                latent, reconstructed = autoencoder.encode(
                    data_np, batch_size, ti_matches.columns)
                # Reconstruction quality proxy: cosine similarity between the
                # flattened input and its reconstruction, summed over rows.
                similarity = torch.cosine_similarity(
                    autoencoder.flatten(data_np, batch_size, ti_matches.columns),
                    reconstructed)
                total_similarity += similarity.sum().item()
                encoded.append(latent.cpu().numpy())
            except RuntimeError as e:
                # Surface shape mismatches with context, then re-raise.
                print(f"RuntimeError: {e}")
                print("Check if the input shape matches the model's expected input size.")
                print(f"Expected input size: {autoencoder.input_dim if hasattr(autoencoder, 'input_dim') else 'unknown'}")
                print(f"Actual input size: {data_np.shape[1]}")
                raise
    print(f"Total similarity: {total_similarity / len(ti_matches)}")
    latent_space = np.concatenate(encoded, axis=0)
    print(f"Latents shape: {latent_space.shape}")

    # K-Means (fixed number of clusters).  CONSISTENCY FIX: this plot now
    # also opens its own figure like every other plot below.
    cluster_labels = kmeans.fit_predict(latent_space)
    print(f"Cluster labels: {np.unique(cluster_labels)}")
    _plot_latent_clusters(latent_space, cluster_labels,
                          "Scatter das Latentes por Cluster")

    # DBSCAN (density-based, no fixed cluster count).
    dbscan = DBSCAN(eps=0.01, min_samples=10)
    dbscan_labels = dbscan.fit_predict(latent_space)
    print(f"DBSCAN labels: {np.unique(dbscan_labels)}")
    # BUGFIX: noise label was mojibake ("RuÃdo"); restored proper encoding.
    _plot_latent_clusters(latent_space, dbscan_labels,
                          "DBSCAN nos Latentes", noise_label="Ruído")

    # Agglomerative clustering (hierarchical, fixed number of clusters).
    agglo = AgglomerativeClustering(n_clusters=n_clusters)
    agglo_labels = agglo.fit_predict(latent_space)
    print(f"Agglomerative labels: {np.unique(agglo_labels)}")
    _plot_latent_clusters(latent_space, agglo_labels,
                          "Agglomerative nos Latentes")

    # Gaussian mixture (soft clustering, fixed number of components).
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm_labels = gmm.fit_predict(latent_space)
    print(f"GMM labels: {np.unique(gmm_labels)}")
    _plot_latent_clusters(latent_space, gmm_labels,
                          "Gaussian Mixture nos Latentes")

    # OPTICS (density-based, no fixed cluster count; DBSCAN handled above).
    optics = OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.05)
    optics_labels = optics.fit_predict(latent_space)
    print(f"OPTICS labels: {np.unique(optics_labels)}")
    _plot_latent_clusters(latent_space, optics_labels,
                          "OPTICS nos Latentes", noise_label="Ruído")
Carregando dataset... Tier: ['professional'], Duração: 30-120 minutos Patches: 7.36 (10844) Carregando dataset... Tier: ['professional'], Duração: 30-120 minutos Patches: 7.33 (9915) Carregando dataset... Tier: ['professional'], Duração: 30-120 minutos Patches: 7.31 (18289) Carregando dataset... Tier: ['professional'], Duração: 30-120 minutos Patches: 7.29 (9756),7.28 (5453) Loading pre-trained model for TI 2024 Modelo carregado de ti_2024_autoencoder.h5 Loading pre-trained model for TI 2023 Modelo carregado de ti_2023_autoencoder.h5 Loading pre-trained model for TI 2022 Modelo carregado de ti_2022_autoencoder.h5 Loading pre-trained model for TI 2021 Modelo carregado de ti_2021_autoencoder.h5 Processing 97 matches from The International 2024 Autoencoder name: ti_2024_autoencoder Total similarity: -0.013836211899353057 Latents shape: (97, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2]
Processing 97 matches from The International 2024 Autoencoder name: ti_2023_autoencoder Total similarity: -0.005962669753383116 Latents shape: (97, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1]
Processing 97 matches from The International 2024 Autoencoder name: ti_2022_autoencoder Total similarity: -0.0013807835729466272 Latents shape: (97, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0]
Processing 97 matches from The International 2024 Autoencoder name: ti_2021_autoencoder Total similarity: -0.02106190641823503 Latents shape: (97, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [-1 0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1]
Processing 132 matches from The International 2023 Autoencoder name: ti_2024_autoencoder Total similarity: -0.018873816610059956 Latents shape: (132, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2]
Processing 132 matches from The International 2023 Autoencoder name: ti_2023_autoencoder Total similarity: -0.003968824745353424 Latents shape: (132, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2]
Processing 132 matches from The International 2023 Autoencoder name: ti_2022_autoencoder Total similarity: -0.004640101060045488 Latents shape: (132, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2]
Processing 132 matches from The International 2023 Autoencoder name: ti_2021_autoencoder Total similarity: -0.013166872389388807 Latents shape: (132, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2]
Processing 195 matches from The International 2022 Autoencoder name: ti_2024_autoencoder Total similarity: -0.01694818826822134 Latents shape: (195, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2]
Processing 195 matches from The International 2022 Autoencoder name: ti_2023_autoencoder Total similarity: -0.0043922345607708664 Latents shape: (195, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [-1 0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2 3]
Processing 195 matches from The International 2022 Autoencoder name: ti_2022_autoencoder Total similarity: 0.0002482656150674209 Latents shape: (195, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [-1 0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2 3]
Processing 195 matches from The International 2022 Autoencoder name: ti_2021_autoencoder Total similarity: -0.015873160652625257 Latents shape: (195, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0]
Processing 758 matches from The International 2021 Autoencoder name: ti_2024_autoencoder Total similarity: -0.010965793479558346 Latents shape: (758, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2 3]
Processing 758 matches from The International 2021 Autoencoder name: ti_2023_autoencoder Total similarity: -0.0013781932937005895 Latents shape: (758, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [-1 0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2]
Processing 758 matches from The International 2021 Autoencoder name: ti_2022_autoencoder Total similarity: -0.001249075820546666 Latents shape: (758, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1 2 3]
Processing 758 matches from The International 2021 Autoencoder name: ti_2021_autoencoder Total similarity: -0.012774305904682206 Latents shape: (758, 2) Cluster labels: [0 1 2 3]
DBSCAN labels: [0]
Agglomerative labels: [0 1 2 3]
GMM labels: [0 1 2 3]
OPTICS labels: [-1 0 1]